Import necessary libraries

library(dplyr)
library(funModeling)
library(MASS)
library(gridExtra)
library(tidyverse)
library(corrplot)
library(Hmisc)
library(survival)
library(survminer)
library(ggplot2)
library(ggpubr)
library(magrittr)
library(knitr)
library(rms)
library(foreign)
library(pROC)
library(timeROC)

Data Processing

Import and clean the dataset

# read data file and check if there is a missing value
surv_data = read_csv("Project_2_data.csv")|>
  janitor::clean_names() |>
  rename(regional_node_positive = reginol_node_positive) |>
  mutate(
    positive_ratio = regional_node_positive/regional_node_examined,
    status = ifelse(status == "Alive", 0, 1)
      ) |>
  mutate(
    grade = case_when(
      grade == "1" ~ 1,
      grade == "2" ~ 2,
      grade == "3" ~ 3,
      grade == "anaplastic; Grade IV" ~ 4)) |>
  mutate(across(where(is.character), as.factor))|>
  relocate(status, survival_months, everything())

any(is.na(surv_data)) # FALSE
## [1] FALSE
summary(surv_data) |>
  knitr::kable()
status survival_months age race marital_status t_stage n_stage x6th_stage differentiate grade a_stage tumor_size estrogen_status progesterone_status regional_node_examined regional_node_positive positive_ratio
Min. :0.0000 Min. : 1.0 Min. :30.00 Black: 291 Divorced : 486 T1:1603 N1:2732 IIA :1305 Moderately differentiated:2351 Min. :1.000 Distant : 92 Min. : 1.00 Negative: 269 Negative: 698 Min. : 1.00 Min. : 1.000 Min. :0.02041
1st Qu.:0.0000 1st Qu.: 56.0 1st Qu.:47.00 Other: 320 Married :2643 T2:1786 N2: 820 IIB :1130 Poorly differentiated :1111 1st Qu.:2.000 Regional:3932 1st Qu.: 16.00 Positive:3755 Positive:3326 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.:0.10345
Median :0.0000 Median : 73.0 Median :54.00 White:3413 Separated: 45 T3: 533 N3: 472 IIIA:1050 Undifferentiated : 19 Median :2.000 NA Median : 25.00 NA NA Median :14.00 Median : 2.000 Median :0.21429
Mean :0.1531 Mean : 71.3 Mean :53.97 NA Single : 615 T4: 102 NA IIIB: 67 Well differentiated : 543 Mean :2.151 NA Mean : 30.47 NA NA Mean :14.36 Mean : 4.158 Mean :0.32647
3rd Qu.:0.0000 3rd Qu.: 90.0 3rd Qu.:61.00 NA Widowed : 235 NA NA IIIC: 472 NA 3rd Qu.:3.000 NA 3rd Qu.: 38.00 NA NA 3rd Qu.:19.00 3rd Qu.: 5.000 3rd Qu.:0.50000
Max. :1.0000 Max. :107.0 Max. :69.00 NA NA NA NA NA NA Max. :4.000 NA Max. :140.00 NA NA Max. :61.00 Max. :46.000 Max. :1.00000

Checking the correlation between numerical variables

cor_matrix = surv_data |>
  select_if(is.numeric) |>
  cor() 
corrplot::corrplot(cor_matrix, 
                   type = "upper",
                   diag = FALSE, 
                   addCoef.col = "black",
                   tl.col = "black",
                   tl.srt = 45)

The correlation matrix shows that the numerical variables in the dataset have low to moderate correlations with each other. The relatively higher correlation among reginal_node_positive, reginal_node_examined, and positive_ratio is expected. In the later analysis, we will only consider positive ratio for model building.

Check the correlation between categorical variables

categorical_vars <- surv_data |>
  select_if(is.factor)

association_test <- function(var1, var2) {
  
  cont_table <- table(var1, var2)
  chi_test <- chisq.test(cont_table)
  n <- sum(cont_table)
  min_dim <- min(dim(cont_table)) - 1
  cramer_v <- sqrt(chi_test$statistic / (n * min_dim))
  
  return(list(
    chi_square = chi_test$statistic,
    p_value = chi_test$p.value,
    cramer_v = cramer_v
  ))
}

results <- list()
var_names <- names(categorical_vars)
for(i in 1:(length(var_names)-1)) {
  for(j in (i+1):length(var_names)) {
    var1 <- var_names[i]
    var2 <- var_names[j]
    
    test_result <- association_test(
      categorical_vars[[var1]], 
      categorical_vars[[var2]]
    )
    
    results[[paste(var1, var2, sep="-")]] <- c(
      test_result,
      list(var1 = var1, var2 = var2)
    )
  }
}

results_df <- do.call(rbind, lapply(results, function(x) {
  data.frame(
    Variable1 = x$var1,
    Variable2 = x$var2,
    Chi_Square = x$chi_square,
    P_Value = x$p_value,
    Cramers_V = x$cramer_v
  )
})) |>
  arrange(P_Value)

kable(results_df, 
      digits = 4,
      col.names = c("var1", "var2", "chi-statistic", "P-value", "Cramer's V"))
var1 var2 chi-statistic P-value Cramer’s V
t_stage-x6th_stage t_stage x6th_stage 6784.0786 0.0000 0.7496
n_stage-x6th_stage n_stage x6th_stage 6686.8341 0.0000 0.9115
estrogen_status-progesterone_status estrogen_status progesterone_status 1054.8431 0.0000 0.5120
x6th_stage-a_stage x6th_stage a_stage 729.1926 0.0000 0.4257
t_stage-a_stage t_stage a_stage 583.2589 0.0000 0.3807
n_stage-a_stage n_stage a_stage 355.5764 0.0000 0.2973
t_stage-n_stage t_stage n_stage 323.4137 0.0000 0.2005
differentiate-estrogen_status differentiate estrogen_status 217.4167 0.0000 0.2324
differentiate-progesterone_status differentiate progesterone_status 147.8031 0.0000 0.1917
x6th_stage-differentiate x6th_stage differentiate 158.0412 0.0000 0.1144
race-marital_status race marital_status 137.9574 0.0000 0.1309
n_stage-differentiate n_stage differentiate 115.5011 0.0000 0.1198
t_stage-differentiate t_stage differentiate 90.9569 0.0000 0.0868
x6th_stage-estrogen_status x6th_stage estrogen_status 52.0046 0.0000 0.1137
n_stage-estrogen_status n_stage estrogen_status 42.5231 0.0000 0.1028
n_stage-progesterone_status n_stage progesterone_status 36.8460 0.0000 0.0957
x6th_stage-progesterone_status x6th_stage progesterone_status 42.4854 0.0000 0.1028
a_stage-estrogen_status a_stage estrogen_status 15.5892 0.0001 0.0622
race-differentiate race differentiate 27.9028 0.0001 0.0589
t_stage-estrogen_status t_stage estrogen_status 19.5499 0.0002 0.0697
race-estrogen_status race estrogen_status 13.4090 0.0012 0.0577
t_stage-progesterone_status t_stage progesterone_status 13.8082 0.0032 0.0586
marital_status-n_stage marital_status n_stage 22.3525 0.0043 0.0527
differentiate-a_stage differentiate a_stage 10.5771 0.0142 0.0513
marital_status-progesterone_status marital_status progesterone_status 11.0469 0.0260 0.0524
marital_status-x6th_stage marital_status x6th_stage 28.1080 0.0307 0.0418
race-progesterone_status race progesterone_status 5.0431 0.0803 0.0354
marital_status-differentiate marital_status differentiate 19.1556 0.0848 0.0398
marital_status-estrogen_status marital_status estrogen_status 7.6903 0.1036 0.0437
marital_status-a_stage marital_status a_stage 7.6585 0.1049 0.0436
a_stage-progesterone_status a_stage progesterone_status 2.3828 0.1227 0.0243
marital_status-t_stage marital_status t_stage 17.1204 0.1451 0.0377
race-n_stage race n_stage 6.0797 0.1933 0.0275
race-t_stage race t_stage 8.4624 0.2061 0.0324
race-x6th_stage race x6th_stage 8.8396 0.3560 0.0331
race-a_stage race a_stage 0.3070 0.8577 0.0087

The correlation analysis of categorical variables in this breast cancer dataset reveals several key patterns. The strongest associations are found between staging-related variables, with differentiate-grade showing perfect correlation (Cramer’s V = 1.0) and n_stage-x6th_stage displaying very strong correlation (Cramer’s V = 0.91). A notable moderate correlation exists between hormone receptor statuses (estrogen_status-progesterone_status, Cramer’s V = 0.51), which aligns with biological understanding. While many other associations are statistically significant (p < 0.05), their weaker Cramer’s V values (< 0.3) suggest limited practical significance.

Check the distribution of numerical variables

surv_data |>
  funModeling::plot_num()

Box-Cox transformation

surv_data = surv_data |>
  mutate(
      positive_ratio_transformed = {
      bc = boxcox(positive_ratio ~ 1, plotit = FALSE)
      lambda = bc$x[which.max(bc$y)]
      if(abs(lambda) < 1e-4) {
        log(positive_ratio)
      } else {
        (positive_ratio^lambda - 1) / lambda
      }
    }
  )
p1 <- ggplot(surv_data, aes(x = positive_ratio)) +
  geom_histogram(fill = "skyblue", bins = 30) +
  labs(title = "Original Distribution",
       x = "Positive Ratio")

p2 <- ggplot(surv_data, aes(x = positive_ratio_transformed)) +
  geom_histogram(fill = "lightgreen", bins = 30) +
  labs(title = "Transformed Distribution",
       x = "Transformed Positive Ratio")

grid.arrange(p1, p2, ncol = 2)

## Model Building

Kaplan-Meier Survival Analysis

survfit(Surv(survival_months,status)~race,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("White", "Black", "Other" ),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~marital_status,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500", "#FF0000"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Married", "Single", "Separated", "Divored", "Widowed"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~t_stage,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500", "#FF0000"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("T1", "T2", "T3", "T4"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~n_stage,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("N1", "N2", "N3"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~x6th_stage,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500", "#FF0000"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("IIA", "IIB", "IIIA","IIIB", "IIIC"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~differentiate,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Moderately differentiated", "Poorly differentiated", "Well differentiated", "Undifferentiated"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~grade,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("1", "2", "3", "4"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~a_stage,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Distant","Regional"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~estrogen_status,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Positive","Negative"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

survfit(Surv(survival_months,status)~progesterone_status,data = surv_data) |> 
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Positive","Negative"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

surv_data = surv_data |>
  mutate(age_group = ifelse(age > mean(age, na.rm = TRUE), "Elder", "Young"))

survfit(Surv(survival_months,status)~age_group,data = surv_data) |>
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Young","Elder"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

surv_data <- surv_data |>
  mutate(tumor_size_group = case_when(
    tumor_size < 40 ~ "Small (<40)",
    tumor_size >= 40 & tumor_size < 80 ~ "Medium (40-80)",
    tumor_size >= 80 & tumor_size < 120 ~ "Large (80-120)",
    tumor_size >= 120 ~ "Very Large (>120)"
  ))

 survfit(Surv(survival_months,status)~tumor_size_group,data = surv_data) |>
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Small (<40)", "Medium (40-80)", "Large (80-120)", "Very Large (>120)"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

surv_data <- surv_data |>
  mutate(positive_ratio_group = case_when(
    positive_ratio <= 0.25 ~ "Low (0.02-0.25)",
    positive_ratio > 0.25 & positive_ratio <= 0.5 ~ "Moderate (0.25-0.5)",
    positive_ratio > 0.5 & positive_ratio <= 0.75 ~ "High (0.5-0.75)",
    positive_ratio > 0.75 ~ "Very High (0.75-1)"
  ))

 survfit(Surv(survival_months,status)~positive_ratio_group,data = surv_data) |>
  ggsurvplot(
    size = 1,
    cex.lab= 2,
    break.time.by = 6,
    xlim = c(0,107),
    axis.title.x =element_text(size=5), 
    axis.title.y = element_text(size=5),
    palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
    conf.int = TRUE,
    pval = TRUE,
    risk.table = TRUE,
    xlab = "Follow-up months", 
    ylab="Survival probability ",
    risk.table.col = "strata",
    risk.table.fontsize = 3, 
    legend.labs =  c("Low (0.02-0.25)", "Moderate (0.25-0.5)", "High (0.5-0.75)", "Very High (0.75-1)"),
    risk.table.height = 0.3, 
    ggtheme = theme_bw()
    )

- All the Kaplan-Meier survival analyses show significant differences in survival probabilities among different groups of each variable.[ Log-rank test: p-value < 0.05]

Univariate Cox Regression Model

\(h(t) = h_0(t) \exp(\beta_1 X)\)

surv_object <- with(surv_data, Surv(survival_months, status))

result_unicox <- data.frame("Variable" = character(),
                     "Hazard Ratio" = numeric(),
                     "95%CI" = character(),
                     "P value" = numeric())

for (variable in c("age", "race", "marital_status", "t_stage", "n_stage", "x6th_stage", "differentiate", "grade", "a_stage", "tumor_size", "estrogen_status", "progesterone_status", "positive_ratio_transformed")) {
  cox_formula <- as.formula(paste("surv_object ~", variable))
  cox_model <- coxph(cox_formula, data = surv_data)
  
  variable_name <- variable
  hazard_ratio <- exp(coef(cox_model))
  ci_lower <- exp(confint(cox_model))[1]
  ci_upper <- exp(confint(cox_model))[2]
  p_value <- summary(cox_model)$coefficients[5]
  
  new_row <- data.frame("Variable" = variable_name,
                        "Hazard Ratio" = hazard_ratio,
                        "95% CI" = paste0("(", round(ci_lower, 2), "-", round(ci_upper, 2), ")"),
                        "P value" = p_value,
                        stringsAsFactors = FALSE)
  
  result_unicox <- rbind(result_unicox, new_row)
}

result_unicox = result_unicox |>
  arrange(desc(`P.value`))


knitr::kable(result_unicox, 
             col.names = c("Variable", "Hazard Ratio", "95% CI", "P value"))
Variable Hazard Ratio 95% CI P value
differentiatePoorly differentiated differentiate 1.9333973 (1.64-2.14) 4.1456318
differentiateUndifferentiated differentiate 4.1456318 (1.64-2.14) 4.1456318
differentiateWell differentiated differentiate 0.5472506 (1.64-2.14) 4.1456318
t_stageT2 t_stage 1.8304505 (1.51-1.89) 2.4047618
t_stageT3 t_stage 2.4047618 (1.51-1.89) 2.4047618
t_stageT4 t_stage 4.6187727 (1.51-1.89) 2.4047618
x6th_stageIIB x6th_stage 1.6858472 (1.3-2) 1.6858472
x6th_stageIIIA x6th_stage 2.5593598 (1.3-2) 1.6858472
x6th_stageIIIB x6th_stage 4.4634086 (1.3-2) 1.6858472
x6th_stageIIIC x6th_stage 6.3473591 (1.3-2) 1.6858472
marital_statusMarried marital_status 0.7155876 (0.57-1.22) 0.7155876
marital_statusSeparated marital_status 2.1134070 (0.57-1.22) 0.7155876
marital_statusSingle marital_status 0.9140521 (0.57-1.22) 0.7155876
marital_statusWidowed marital_status 1.1476127 (0.57-1.22) 0.7155876
raceOther race 0.3686767 (0.24-0.43) 0.2098027
raceWhite race 0.5489746 (0.24-0.43) 0.2098027
n_stageN2 n_stage 2.1583532 (1.78-3.83) 0.0988284
n_stageN3 n_stage 4.6243027 (1.78-3.83) 0.0988284
age age 1.0157825 (1.01-1.03) 0.0007084
a_stageRegional a_stage 0.3198659 (0.23-0.45) 0.0000000
tumor_size tumor_size 1.0134463 (1.01-1.02) 0.0000000
grade grade 1.9206998 (1.69-2.18) 0.0000000
progesterone_statusPositive progesterone_status 0.3842797 (0.32-0.45) 0.0000000
estrogen_statusPositive estrogen_status 0.2726864 (0.22-0.34) 0.0000000
positive_ratio_transformed positive_ratio_transformed 1.8948182 (1.73-2.08) 0.0000000

According to the univariate Cox regression analysis, the variables age, a_stage, tumor_size, grade, progesterone_status, estrogen_status, and positive_ratioare significantly associated with the survival outcome (p < 0.05).

(Explain Hazard ratio… continuous/ categorical)

Multivariate Cox Regression Model

result_use = result_unicox |>
mutate(
  significant = ifelse(result_unicox$P.value < 0.05,"significant","Not significant"))

multi_cox <- result_use[result_use$significant == "significant",]$Variable

formula <- as.formula(paste("Surv(survival_months, status) ~", paste(multi_cox, collapse = " + ")))

multi_cox_model <- coxph(formula, data = surv_data)

data_use <- summary(multi_cox_model)

multi_cox_HR <- round(data_use$coefficients[,2],2)
multi_cox_CI2.5 <- round(data_use$conf.int[,3],2)
multi_cox_CI97.5 <- mul_CI95<-round(data_use$conf.int[,4],2)
multi_cox_CI <- paste0('(',multi_cox_CI2.5,'-',multi_cox_CI97.5,')')
multi_cox_P_value <- round(data_use$coefficients[,5],3)
Variable <- row.names(data.frame(data_use$coefficients))
multi_cox_result<- data.frame(Variable,multi_cox_HR,multi_cox_CI2.5,multi_cox_CI97.5,multi_cox_CI,multi_cox_P_value)

knitr::kable(multi_cox_result)
Variable multi_cox_HR multi_cox_CI2.5 multi_cox_CI97.5 multi_cox_CI multi_cox_P_value
age age 1.02 1.01 1.03 (1.01-1.03) 0.000
a_stageRegional a_stageRegional 0.60 0.43 0.86 (0.43-0.86) 0.005
tumor_size tumor_size 1.01 1.00 1.01 (1-1.01) 0.000
grade grade 1.58 1.38 1.80 (1.38-1.8) 0.000
progesterone_statusPositive progesterone_statusPositive 0.61 0.50 0.76 (0.5-0.76) 0.000
estrogen_statusPositive estrogen_statusPositive 0.49 0.38 0.64 (0.38-0.64) 0.000
positive_ratio_transformed positive_ratio_transformed 1.70 1.55 1.87 (1.55-1.87) 0.000
ggplot(multi_cox_result, aes(multi_cox_HR, Variable)) + 
  geom_vline(xintercept = 1,
             linetype = "dashed",
             size = 1) +
  geom_errorbar(aes(xmin = multi_cox_CI2.5, xmax = multi_cox_CI97.5),width = 0.1) +
  geom_point(aes(color = multi_cox_P_value),size = 5, shape = 18) +
  scale_color_continuous(low = 'skyblue', high = 'red') +
  labs(x = 'Hazard ratio', title = 'Forest plot for multivariate cox regression') +
  theme_pubr() +
  theme(legend.position = 'right')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

According to the multivariate Cox regression analysis, the variables age, a_stage, tumor_size, grade, progesterone_status, estrogen_status, and positive_ratio are significantly associated with the survival outcome (p < 0.05).

Nomogram for Survival Prediction